# -*- coding: utf-8 -*-
"""
Created on Thu Aug  1 14:10:19 2019

@author: rklotz

--- merges origin/destination port from E/C use in analysis
Updated to 04082020 version
"""

import numpy as np
import pandas as pd

#import datetime 
import pytz

pacific_tz = pytz.timezone("US/Pacific")

# tracks file (this includes all tracks)
#tracks_csv = r"C:\Users\rklotz\OneDrive\Work\Ports_ECAs\merge_entrance_clearance\CA_tracks_07062019.csv"
#tracks_csv = r"C:\Users\rklotz\OneDrive\Work\Ports_ECAs\merge_entrance_clearance\CA_tracks_07062019.csv"
#tracks_other_csv = r"H:\My Drive\Boats\Stata\CA_tracks\CA_tracks_wAP2_other_04082020.csv"
cols = ['track_id','TIME1','TIME2','PORT1','PORT2','RIGHT1','RIGHT2','AISyear','MMSI','IMO'
        ,'port_name1','port_agg1','port_name2','port_agg2','dirNW','INTERP_FLAG','VesselType','group_agg','Length','Width']

EC_cols = ['typedoc','year','month','day','port','port_name','imo_upd',
           'ec_country','ec_region','ec_port','where_port_agg','where_port_agg2','where_schedk','where_ind','ECport_agg','ECport_agg2','FORPORT_NAME']


def EC_merge( df_tracks_in , EC_dta , agg_region ) :
    
    # load ports file
    #ports_df = pd.read_excel(ports_decs)
    
    # load region file
    region_df = pd.read_excel(agg_region)
    
    # load entrance/clearance
    print("Loading E/C")
    df_EC = pd.read_stata(EC_dta,columns=EC_cols)
    df_EC['clearance'] = df_EC['typedoc']
    mdy = df_EC[['year','month','day']].astype('int32')
    df_EC['date'] = pd.to_datetime(mdy).dt.tz_localize(pacific_tz)
    df_EC['fromEC'] = 1
        
    # dropping 2007-2008
    df_EC = df_EC[df_EC.year>=2009]
    
    # dropping missing IMO
    df_EC = df_EC.loc[~df_EC.imo_upd.isnull()] # dropping missing IMO
    
    # adding broad where_port aggregation
    df_EC = df_EC.merge(region_df,left_on='where_port_agg2',right_on='where_port_agg_orig',how='inner')
    
    # removing duplicate entrances (same IMO entering same port area on same day (ie visiting to separate minor ports))
    print(len(df_EC))
    df_EC.drop_duplicates(subset=['imo_upd','date','clearance','ECport_agg'],keep='first',inplace=True)
    print(len(df_EC))
                
    # load tracks
    #print("Loading Tracks")
    df_tracks = df_tracks_in.loc[:,cols].copy()
    #df_tracks = pd.read_csv(tracks_main_csv,low_memory=False,usecols=cols)
    #df_tracks_other = pd.read_csv(tracks_other_csv,low_memory=False,usecols=cols)
    #df_tracks = pd.concat([df_tracks,df_tracks_other],axis=0,ignore_index=True,sort=True)
    #del df_tracks_other
    
    # flagging tracks where WC port is 
    # need the nan option here to get entrance/exits without boundary
    df_tracks['flag_WC1'] = ( (df_tracks.PORT1<=12) &  ( (df_tracks.PORT2>12) | (df_tracks.PORT2.isna())  )  )
    
    # converting to pacific time
    df_tracks['TIME1'] = pd.to_datetime( df_tracks['TIME1'] ).dt.tz_localize('UTC').dt.tz_convert(pacific_tz)
    df_tracks['TIME2'] = pd.to_datetime( df_tracks['TIME2'] ).dt.tz_localize('UTC').dt.tz_convert(pacific_tz)
        
    # creating time when vessel intersects with west coast port
    #df_tracks['WC1'] = df_tracks.PORT1<=12
    #df_tracks['WC2'] = df_tracks.PORT2<=12
    df_tracks['TIME'] = np.where( df_tracks.flag_WC1 , df_tracks.TIME1 , df_tracks.TIME2 )
    #df_tracks['TIME'] = pd.to_datetime( df_tracks['TIME'] ).dt.tz_localize('UTC').dt.tz_convert(pacific_tz)
    
    df_tracks['day1'] = df_tracks.TIME1.dt.day
    df_tracks['day2'] = df_tracks.TIME2.dt.day
    
    df_tracks['month1'] = df_tracks.TIME1.dt.month
    df_tracks['month2'] = df_tracks.TIME2.dt.month
    
    # flag entrance/exits
    #df_tracks['EnterExit'] = np.where( ( (df_tracks.PORT1<=12) | (df_tracks.PORT2<=12) ) & ( (df_tracks.PORT1>100) | (df_tracks.PORT2 >100) ) , 1  , 0 )
    
    # creating port for entrance/clearance
    # coastal routes would presumably have an entrance and a clearance
    # but we are not concerned about this since we already know origin/destination
    df_tracks['ECport'] = np.where( df_tracks.flag_WC1 , df_tracks.port_name1 , df_tracks.port_name2 )
        
    #df_tracks['IMO'] = df_tracks['IMO'].str.replace('IMO','').astype('int64')
    #df_tracks['sample_ind'] = ~ ( df_tracks.IMO.isna() | df_tracks.INTERP_FLAG==1 ) # dropping missing IMO
    #df_tracks['fromAIS'] = 1
    
    # getting entrances and exits to merge EC to
    #df_enter_exits = df_tracks.loc[df_tracks.EnterExit==1].copy()
    #df_tracks['clearance'] = df_tracks['dirNW']
    df_tracks['clearance'] = df_tracks['flag_WC1'] * 1 
    
    # merging with date buffer (merge on imo and port, then keep only if date gap less than 1 day )
    print('Merging')
    df_merged = pd.merge(df_tracks,df_EC,left_on=['IMO','ECport','clearance'],right_on=['imo_upd','ECport_agg','clearance'],how='outer')
    #df_merged = df_merged.loc[df_merged.fromEC.notna(),:].copy() # removing
    del df_tracks
        
    df_merged['diff_hours'] = ( df_merged.date - df_merged.TIME ).dt.total_seconds() / 3600
    df_merged['abs_diff_hours'] = abs(df_merged['diff_hours'])
    df_merged['match24'] = ( abs(df_merged['diff_hours'])<=24 )
    df_merged['match36'] = ( abs(df_merged['diff_hours'])<=36 )
    df_merged['match48'] = ( abs(df_merged['diff_hours'])<=48 )
    
    # reporting matches by different time bounds
    df_merged.groupby(['ECport'],as_index=False).match24.sum()
    df_merged.groupby(['ECport'],as_index=False).match36.sum()
    df_merged.groupby(['ECport'],as_index=False).match48.sum()
    
    df_min_match = df_merged.groupby(['track_id'],as_index=False).abs_diff_hours.min().add_suffix('_min')
    df_merged = df_merged.merge(df_min_match,left_on=['track_id'],right_on=['track_id_min'])
    del df_min_match
    
    # selecting matches
    # minimum time difference with certain time band
    df_match = df_merged.loc[( df_merged['match48']==1 ) & (df_merged.abs_diff_hours==df_merged.abs_diff_hours_min) , : ].copy()
    del df_merged
    
    # duplicate voyage (matched to multiple EC)
    df_match['dup_track'] = df_match.duplicated(['track_id'],keep=False)
    # duplicate EC (matched to multiple voyages)
    df_match['dup_EC'] = df_match.duplicated(['imo_upd','ECport_agg','clearance','date'],keep=False)
    print(len(df_match))
    print(df_match.dup_track.sum())
    print(df_match.dup_EC.sum())
    
    df_match = df_match.loc[df_match.dup_EC==False].copy()
    
    # create port variables with entrances and clearances
    df_match["port_name1_ECport"] = np.where( ( df_match['flag_WC1']==False ) , df_match.ec_port , df_match.port_name1 )
    df_match["port_name2_ECport"] = np.where( ( df_match['flag_WC1']==True ) , df_match.ec_port , df_match.port_name2 )
    
    df_match["port_name1_ECcountry"] = np.where( ( df_match['flag_WC1']==False )  , df_match.ec_country , df_match.port_name1 )
    df_match["port_name2_ECcountry"] = np.where( ( df_match['flag_WC1']==True )  , df_match.ec_country , df_match.port_name2 )
    
    df_match["port_name1_ECregion"] = np.where( ( df_match['flag_WC1']==False )  , df_match.ec_region , df_match.port_name1 )
    df_match["port_name2_ECregion"] = np.where( ( df_match['flag_WC1']==True )  , df_match.ec_region , df_match.port_name2 )
    
    
    #a = df_match[(df_match.group_agg=="FCC") ].groupby(['port_name1_ECcountry','port_name2_ECcountry'],as_index=False).track_id.count()
    #a = df_match[(df_match.group_agg=="FCC") & (df_match.dirNW==1) & (df_match.PORT1.isin([1,2,3,4,6,7,8,9]))].groupby(['ec_port'],as_index=False).track_id.count()
    
    
    # merge matches back to clean full data set 
    vars_to_merge = ["track_id","port_name1_ECport","port_name2_ECport","port_name1_ECcountry","port_name2_ECcountry","port_name1_ECregion","port_name2_ECregion"]  
    df_match = df_match.loc[:,vars_to_merge]
    return df_match 


if __name__ == '__main__':
    
    # tracks file
    #tracks_main_csv = r"H:\My Drive\Boats\Stata\CA_tracks\CA_tracks_wAP2_04082020.csv"
    tracks_main_csv = r'C:\Users\rklotz\Documents\data\lines_v2\processed_lines_speed\tracks_speed.csv'
    
    # more aggregate region definitions
    agg_region = r"C:\Users\rklotz\OneDrive\Work\Ports_ECAs\merge_entrance_clearance\ECport_agg.xlsx"

    # EC data after being cleaned in stata
    EC_dta = r"H:\My Drive\Boats\entrance_clearance\ec0716_clean.dta"

    # port names xlsx file
    #ports_decs = r"H:/My Drive/Boats/Stata/CA_tracks/port_decs_wAK_HI.xlsx"
    print("Loading Tracks")
    # WANT JUST ENTRANCE/EXITS
    df_tracks_in = pd.read_csv(tracks_main_csv,low_memory=False)
    #df_tracks_in['EnterExit'] = np.where( ( (df_tracks_in.PORT1<=12) | (df_tracks_in.PORT2<=12) ) & ( (df_tracks_in.PORT1>100) | (df_tracks_in.PORT2 >100) ) , 1  , 0 )
    #df_enter_exits = df_tracks_in.loc[df_tracks_in.EnterExit==True,:].copy()
    
    df_enter_exits = df_tracks_in.copy()
    
    
    
    df_ECmatch=EC_merge(df_enter_exits , EC_dta , agg_region )
    
    df_tracks = df_tracks_in.merge(df_ECmatch,on='track_id',how='left')
    df_tracks['ECmatch'] = df_tracks.port_name1_ECport.notna()
        
    #### NOW LOOKING AT SUMMARY STATS
    # generate statistics for which vessels we obtain origin/destinations for 
    df_group = pd.DataFrame( [["FCC","FCC"],["Bulker","OtherCargo"],["Other Cargo","OtherCargo"],["Tankers","Tankers"]] , columns = ['group_agg','group_agg2'] )
    
    df_tracks = df_tracks.merge(df_group,on="group_agg",how='left')
    
    df_tracks['WCPORT'] = np.where(df_tracks.PORT1>100,df_tracks.port_name2,df_tracks.port_name1)
    df_tracks['FORPORT'] = np.where(df_tracks.PORT1>100,df_tracks.port_name1_ECport,df_tracks.port_name2_ECport)
    df_tracks['FORREGION'] = np.where(df_tracks.PORT1>100,df_tracks.port_name1_ECregion,df_tracks.port_name2_ECregion)
    df_tracks['FORCOUNTRY'] = np.where(df_tracks.PORT1>100,df_tracks.port_name1_ECcountry,df_tracks.port_name2_ECcountry)
    
    df_tracks['has_imo'] = df_tracks.IMO.notna()
    
    # across all entrance/exits in our data set
    # these are only those that have IMO numbers...
    df_tracks[(df_tracks.EnterExit==1) & (df_tracks.has_imo==True) ].groupby(['group_agg2'],as_index=False).ECmatch.mean()
    # we recover 78% of origin/destinations for FCCs
    #            48% for Tankers
    #            51% for other cargo
    
    # across all entrance/exits in our data set, by year
    a = df_tracks[(df_tracks.EnterExit==1)  & (df_tracks.has_imo==True)  ].groupby(['group_agg2','AISyear'],as_index=False).ECmatch.mean()
    # pretty consistent - although falls a bit in 2013--2015 (due to IMO matching?)
    
    # broken down by port
    df_tracks[(df_tracks.EnterExit==1)  & (df_tracks.has_imo==True)  ].groupby(['group_agg2','WCPORT'],as_index=False).ECmatch.mean()
        # LALB 86% of FCC, 68% other cargo
        # SF Bay 82% of FCC, 80% of other cargo
    # much lower for seattle (because many going to Canadian ports)
        # 48% of FCC, 20% of other cargo    
    a = df_tracks[(df_tracks.EnterExit==1) & (df_tracks.group_agg=="FCC") ].groupby(['FORREGION','FORCOUNTRY','FORPORT'],as_index=False).track_id.count()
    
    